source("functions.R")

R libraries

library(GenomicAlignments)
library(Rsamtools)
library(tidyverse)
library(Rsubread)
library(magrittr)

RNA-seq quantification

design table

# design table 
design_table <- read_table_in("RNASEQ_QUANTIFICATION/design_table.tsv")
DT::datatable(design_table, options = list(scrollX = TRUE))

gene level quantification (TPMs)

# gene level quantification (TPMs)
quant_gene_level <- read_table_in("RNASEQ_QUANTIFICATION/merged_quantification_genes.tsv")
DT::datatable(quant_gene_level[1:100,], options = list(scrollX = TRUE))

transcript level quantification (TPMs)

# transcript level quantification (TPMs)
quant_transcript_level <- read_table_in("RNASEQ_QUANTIFICATION/merged_quantification_transcripts.tsv")
DT::datatable(quant_transcript_level[1:100,], options = list(scrollX = TRUE))

ATAC-seq

design table

# design table
DT::datatable(read.csv("ATACSEQ/design.csv"),options = list(scrollX = TRUE)) 

Files with all regions

Output from nf-core ATACseq pipeline (https://nf-co.re/atacseq)

# bam/bai
ATACSEQ/all_regions/bam

# bigwig
ATACSEQ/all_regions/bigwig

# macs2 narrow peaks
ATACSEQ/all_regions/narrowPeak

# qc summary
ATACSEQ/all_regions/multiqc_data

“Nucleosome free” files (insert size < 100bp)

I filtered bams from ATACSEQ/all_regions/bam with samtools and did peak calling on them:

# filtereing bams with samtools and awk
samtools view -h ${f} | \
  awk 'substr($0,1,1)=="@" || ($9<= 100 && $9>0) || ($9>=-100 && $9<0)' | \
  samtools view -@ 8 -b > ./bam/$sample.ncfree.bam
samtools sort -@ 8 ./bam/$sample.ncfree.bam -o ./bam/$sample.ncfree.sorted.bam
samtools flagstat -@ 8 ./bam/$sample.ncfree.sorted.bam > ./bam/$sample.ncfree.sorted.bam.stats
samtools index ./bam/$sample.ncfree.sorted.bam

# macs2 peak calling
macs2 callpeak -t ${f} -f BAMPE --outdir macs2_peaks --name ${sample} -g 269400000

# bam/bai
ATACSEQ/nucleosome_free_regions/bam

# macs2 narrow peaks
ATACSEQ/nucleosome_free_regions/macs2_peaks

Insert sizes before /after

before ‘all regions’

library(GenomicAlignments)
library(tidyverse)

atacReads <- readGAlignmentPairs(
  "ATACSEQ/all_regions/bam/Elav_neg_R1.mLb.clN.sorted.bam", 
  param = ScanBamParam(mapqFilter = 1, 
                       flag = scanBamFlag(isPaired = TRUE, 
                                          isProperPair = TRUE), 
                       what = c("qname", "mapq", "isize")))

atacReads_read1 <- GenomicAlignments::first(atacReads)
insertSizes <- abs(elementMetadata(atacReads_read1)$isize)
head(insertSizes)
## [1] 280 441  75  65 207 341
fragLenPlot <- table(insertSizes) %>% data.frame %>% rename(InsertSize = insertSizes, 
    Count = Freq) %>% mutate(InsertSize = as.numeric(as.vector(InsertSize)), 
    Count = as.numeric(as.vector(Count))) %>% ggplot(aes(x = InsertSize, y = Count)) + 
    geom_line()

fragLenPlot + geom_vline(xintercept = c(180, 247), colour = "red") + 
  geom_vline(xintercept = c(315, 437), colour = "darkblue") + 
  geom_vline(xintercept = c(100), colour = "darkgreen") + 
  theme_bw()

after ‘nucleosome_free_regions’

library(GenomicAlignments)
library(tidyverse)

atacReads <- readGAlignmentPairs(
  "ATACSEQ/nucleosome_free_regions/bam/Elav_neg_R1.mLb.clN.ncfree.bam", 
  param = ScanBamParam(mapqFilter = 1, 
                       flag = scanBamFlag(isPaired = TRUE, 
                                          isProperPair = TRUE), 
                       what = c("qname", "mapq", "isize")))

atacReads_read1 <- GenomicAlignments::first(atacReads)
insertSizes <- abs(elementMetadata(atacReads_read1)$isize)
head(insertSizes)
## [1] 75 65 48 61 57 43
fragLenPlot <- table(insertSizes) %>% data.frame %>% rename(InsertSize = insertSizes, 
    Count = Freq) %>% mutate(InsertSize = as.numeric(as.vector(InsertSize)), 
    Count = as.numeric(as.vector(Count))) %>% ggplot(aes(x = InsertSize, y = Count)) + 
    geom_line()

fragLenPlot + geom_vline(xintercept = c(180, 247), colour = "red") + 
  geom_vline(xintercept = c(315, 437), colour = "darkblue") + 
  geom_vline(xintercept = c(100), colour = "darkgreen") + 
  theme_bw()

Distribution of the mapped reads

path = "ATACSEQ/nucleosome_free_regions/bam/"
for(i in list.files(path, pattern = ".sorted.bam$")){
  print(idxstatsBam(paste0(path,i)) %>% 
          ggplot(aes(seqnames, mapped, fill = seqnames)) + 
          geom_bar(stat = "identity") + coord_flip() + ggtitle(i))}

Session info

sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] forcats_0.5.2               stringr_1.4.1              
##  [3] dplyr_1.0.10                purrr_0.3.4                
##  [5] readr_2.1.2                 tidyr_1.2.1                
##  [7] tibble_3.1.8                ggplot2_3.3.6              
##  [9] tidyverse_1.3.2             GenomicAlignments_1.32.1   
## [11] Rsamtools_2.12.0            Biostrings_2.64.1          
## [13] XVector_0.36.0              SummarizedExperiment_1.26.1
## [15] Biobase_2.56.0              MatrixGenerics_1.8.1       
## [17] matrixStats_0.62.0          GenomicRanges_1.48.0       
## [19] GenomeInfoDb_1.32.4         IRanges_2.30.1             
## [21] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7           fs_1.5.2               lubridate_1.8.0       
##  [4] httr_1.4.4             tools_4.2.2            backports_1.4.1       
##  [7] bslib_0.4.0            utf8_1.2.2             R6_2.5.1              
## [10] DT_0.25                DBI_1.1.3              colorspace_2.0-3      
## [13] withr_2.5.0            tidyselect_1.1.2       compiler_4.2.2        
## [16] rvest_1.0.3            cli_3.4.1              xml2_1.3.3            
## [19] DelayedArray_0.22.0    labeling_0.4.2         sass_0.4.2            
## [22] scales_1.2.1           digest_0.6.29          rmarkdown_2.16        
## [25] pkgconfig_2.0.3        htmltools_0.5.3        highr_0.9             
## [28] dbplyr_2.2.1           fastmap_1.1.0          htmlwidgets_1.5.4     
## [31] rlang_1.0.6            readxl_1.4.1           rstudioapi_0.14       
## [34] farver_2.1.1           jquerylib_0.1.4        generics_0.1.3        
## [37] jsonlite_1.8.0         crosstalk_1.2.0        BiocParallel_1.30.3   
## [40] googlesheets4_1.0.1    RCurl_1.98-1.8         magrittr_2.0.3        
## [43] GenomeInfoDbData_1.2.8 Matrix_1.5-1           munsell_0.5.0         
## [46] fansi_1.0.3            lifecycle_1.0.2        stringi_1.7.8         
## [49] yaml_2.3.5             zlibbioc_1.42.0        grid_4.2.2            
## [52] parallel_4.2.2         crayon_1.5.1           lattice_0.20-45       
## [55] haven_2.5.1            hms_1.1.2              knitr_1.40            
## [58] pillar_1.8.1           codetools_0.2-18       reprex_2.0.2          
## [61] glue_1.6.2             evaluate_0.16          modelr_0.1.9          
## [64] vctrs_0.4.1            tzdb_0.3.0             cellranger_1.1.0      
## [67] gtable_0.3.1           assertthat_0.2.1       cachem_1.0.6          
## [70] xfun_0.33              broom_1.0.1            googledrive_2.0.0     
## [73] gargle_1.2.1           ellipsis_0.3.2